In [1]:
%matplotlib inline
import pandas as pd
from dateutil.relativedelta import relativedelta
import statsmodels.formula.api as sm
import requests
import pickle

Find harassed editors

Any editor who ever made an edit and received a harassing comment from another user on their user talk page.
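
Concretely, a comment counts as harassing if any of its three classifier scores (pred_attack_score, pred_aggression_score, pred_toxicity_score) exceeds the 0.425 threshold used below. A minimal row-level sketch of that criterion (the actual filtering below is done with a vectorized query):

# hedged sketch of the harassment criterion; restates the query used to build df_attacked_users
def is_harassing(row, threshold=0.425):
    return (row['pred_attack_score'] > threshold
            or row['pred_aggression_score'] > threshold
            or row['pred_toxicity_score'] > threshold)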


In [20]:
df_all_users = pd.read_csv("../../data/retention/user_start.tsv", sep = "\t")
print(df_all_users.shape)
print(len(df_all_users.user_id.unique()))


(9131778, 2)
9131778

In [5]:
# load data for resolving user page titles to user_ids
# only works for ns=user

df_i2ns = pd.read_csv("../../data/retention/user_id_to_names.tsv", sep = "\t")
print(df_i2ns.shape)
# we can't handle different user_ids taking on the same username at different times,
# so keep only the first occurrence of each username
df_i2ns = df_i2ns.drop_duplicates("user_text")
print(df_i2ns.shape)
df_i2ns = df_i2ns.rename(columns={'user_id': 'to_user_id', 'user_text': 'to_user_text'})

def resolve_page_title(df):
    # strip subpage suffixes, e.g. "Name/Archive_1" -> "Name"
    df['to_user_text'] = df['to_user_text'].apply(lambda x: str(x).split("/")[0])
    # left-join against the id-to-name table to recover the page owner's user_id
    df = df.merge(df_i2ns, how = "left", on = "to_user_text")
    del df['to_user_text']
    return df


(9119417, 2)
(9110021, 2)
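
As a usage note, resolve_page_title maps a user talk page title to the user_id of the page's owner by stripping any subpage suffix and left-joining against df_i2ns. A hedged example with a hypothetical title:

# "Jane_Doe/Archive_1" is a hypothetical page title; only the part before the
# first "/" is used for the join, so archive subpages resolve to their owner
example = pd.DataFrame({'to_user_text': ['Jane_Doe/Archive_1']})
resolve_page_title(example)  # replaces to_user_text with a to_user_id column (NaN if the name is unknown)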

In [6]:
# get the unique set of users who received an attacking comment on their user talk page
usecols = [0, 3, 5, 6, 8, 9, 11, 12, 13]
years = range(2001, 2016)
threshold = 0.425

dfs = []

for year in years:
    df = pd.read_csv("../../data/figshare/scored/comments_user_%d.tsv.gz" % year,
                     sep = "\t",
                     compression = "gzip",
                     usecols = usecols)

    df = df.query("bot == 0 and admin == 0")

    df = df.rename(columns={'user_id': 'from_user_id',
                            'user_text': 'from_user_text',
                            'page_title': 'to_user_text'})

    df = df.query("pred_attack_score > %f \
                   or pred_aggression_score > %f \
                   or pred_toxicity_score > %f" % (threshold, threshold, threshold))

    df = df[['to_user_text', 'from_user_id']]

    # get to_user_id
    df = resolve_page_title(df).dropna()
    # remove comments by user on own page
    df = df.query("from_user_id != to_user_id")
    # grab ids of attacked users
    df = df[['to_user_id']]

    dfs.append(df)
    print(df.shape)
        
        
df_attacked_users = pd.concat(dfs).drop_duplicates()
df_attacked_users.columns = ['user_id']
print("Num atttacked pages: ", df_attacked_users.shape[0])

# get user start dates
df_attacked_users = df_attacked_users.merge(df_all_users, on = 'user_id')
df_attacked_users['first_edit_day'] = pd.to_datetime(df_attacked_users['first_edit_day'], format = '%Y%m%d')
df_attacked_users = df_attacked_users.dropna()
print( df_attacked_users.shape[0])

# save df
df_attacked_users['sample'] = "attacked"
df_attacked_users.to_csv("../../data/retention/attacked_users.csv", index = False)
print("Num atttacked users: ", df_attacked_users.shape[0])


(1, 1)
(58, 1)
(350, 1)
(1205, 1)
(4470, 1)
(14678, 1)
(17473, 1)
(14282, 1)
(10519, 1)
(8304, 1)
(6521, 1)
(6481, 1)
(5648, 1)
(5825, 1)
(5401, 1)
Num attacked pages:  27690
27690
Num attacked users:  27690

Random User Sample

Random sample of users who made at least one edit at some point.


In [7]:
n_random = 100000
df_random_users = df_all_users.sample(n_random, random_state = 12)
df_random_users['sample'] = "random"
df_random_users['first_edit_day'] = pd.to_datetime(df_random_users['first_edit_day'], format = '%Y%m%d')
df_random_users = df_random_users.dropna()
df_random_users.to_csv("../../data/retention/random_users.csv", index = False)
print("Sample Size: ", df_random_users.shape[0])


Sample Size:  100000

Load user history data for newcomer sample

The data used in this analysis includes:

  1. all user and article talk page comments, labeled by the harassment classifiers, except those generated by bots or templates
  2. all newly registered users who made at least one edit
  3. edits per day per namespace for all newcomers
  4. user warnings received by newcomers in the sample
  5. genders of all editors, where available

In [8]:
df_sample_users = pd.concat([df_random_users, df_attacked_users]).drop_duplicates()
# observation window: the first 186 days (roughly 6 months) after each user's first edit
df_sample_users["last_day"] = df_sample_users["first_edit_day"] + pd.Timedelta('186 days')

In [9]:
# get comments from users in sample for first 6 months
nss = ['user', 'article']

dfs = []

for year in years:
    for ns in nss:

        df = pd.read_csv("../../data/figshare/scored/comments_%s_%d.tsv.gz" % (ns, year),
                         sep = "\t",
                         compression = "gzip",
                         usecols = usecols)
        
        df = df.query("bot == 0 and admin == 0")
        
        df = df.rename(columns={'user_id': 'from_user_id', 
                                'user_text': 'from_user_text', 
                                'page_title': 'to_user_text'})
        
        df['ns'] = ns
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        
        if ns == "user":
            df = resolve_page_title(df)
        else:
            df['to_user_id'] = -1
            del df['to_user_text']
            
        df = df.query("from_user_id != to_user_id")
        
        
        # comments made by users in the sample within 6 months of their first edit
        df = df.merge(df_sample_users[['user_id', 'last_day']], how = 'inner', left_on = "from_user_id", right_on = 'user_id')
        del df['user_id']
        df = df.query("timestamp < last_day")
        dfs.append(df)
        
        
df_comments_from = pd.concat(dfs).drop_duplicates("rev_id")
del df_comments_from['last_day']
print(df_comments_from.shape[0])


2354123

In [10]:
# get comments to users in sample for first 6 months
dfs = []

for year in years:

    df = pd.read_csv("../../data/figshare/scored/comments_user_%d.tsv.gz" %  year,
                     sep = "\t",
                     compression = "gzip",
                     usecols = usecols)

    df = df.query("bot == 0 and admin == 0")

    df = df.rename(columns={'user_id': 'from_user_id', 
                            'user_text': 'from_user_text', 
                            'page_title': 'to_user_text'})

    df['ns'] = 'user'
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = resolve_page_title(df)
    df = df.query("from_user_id != to_user_id")


    df = df.merge(df_sample_users[['user_id','last_day']], how = 'inner', left_on = 'to_user_id', right_on = 'user_id')
    del df['user_id']
    df = df.query("timestamp < last_day")
    dfs.append(df)

df_comments_to = pd.concat(dfs).drop_duplicates("rev_id")
del df_comments_to['last_day']
print(df_comments_to.shape[0])


832604

In [11]:
# load edits per day for editors in sample
df_edits = pd.read_csv("../../data/retention/daily_revision_counts.tsv", sep = "\t")
print(df_edits.shape[0])
df_edits = df_edits.merge(df_sample_users, how = 'inner', on = 'user_id')
df_edits['timestamp'] = pd.to_datetime(df_edits['day'].astype(str))
print(df_edits.shape[0])
df_edits = df_edits.query("timestamp < last_day")
print(df_edits.shape[0])


92282221
32422391
3274798

In [12]:
# load user warnings for editors in sample
df_uw = pd.read_csv("../../data/retention/user_warnings.tsv", sep = "\t")

df_uw = df_uw.rename(columns={'user_id': 'from_user_id', 
                                'user_text': 'from_user_text', 
                                'page_title': 'to_user_text'})

df_uw = resolve_page_title(df_uw)
df_uw = df_uw.merge(df_sample_users[['user_id','last_day']], how = 'inner', left_on = 'to_user_id', right_on = 'user_id')
del df_uw['user_id']
df_uw['timestamp'] = pd.to_datetime(df_uw['rev_timestamp'])
df_uw = df_uw.query("timestamp < last_day")
print(df_uw.shape[0])


39107

In [13]:
# create df of consolidated user level features
df_gender = pd.read_csv("../../data/misc/genders.tsv", sep = "\t")[['user_id', 'gender']]
df_user = df_sample_users.merge(df_gender, on = 'user_id', how = "left")
df_user['gender'] = df_user['gender'].fillna('unknown')

Create and Pickle User Objects

To help with extracting user-level features in subsequent analyses, we group the data sources above by user and store the results in dedicated User objects.


In [14]:
# map data frames into dictionaries keyed by user
def gb_to_dict(gb):
    return {key: group for key, group in gb}

df_comments_from_groups = gb_to_dict(df_comments_from.groupby("from_user_id"))
df_comments_to_groups = gb_to_dict(df_comments_to.query("ns == 'user'").groupby("to_user_id"))
df_edits_groups = gb_to_dict(df_edits.groupby("user_id"))
df_user_groups = gb_to_dict(df_user.groupby("user_id"))
df_uw_groups = gb_to_dict(df_uw.groupby("to_user_id"))  # the page title identifies the recipient of the user warning
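
Each dictionary maps a user_id to that user's slice of the corresponding data frame, so per-user lookups are constant time. A hedged usage example (some_user_id is a placeholder):

# look up one user's edit history; .get returns None for users with no rows
some_user_id = df_user['user_id'].iloc[0]
df_edits_groups.get(some_user_id)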

In [15]:
# collect User objects 
%load_ext autoreload
%autoreload 2
from user_object import User
import pickle
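
The user_object module itself is not shown in this notebook. Given how the constructor is called below, a minimal sketch of what User presumably does, assuming it simply keeps each per-user slice (class and attribute names here are hypothetical):

# hedged sketch, not the actual user_object.User implementation
class UserSketch:
    def __init__(self, user_id, comments_from, comments_to, edits, users, uws):
        self.user_id = user_id
        # .get(...) yields None when a user has no rows in a given source
        self.df_comments_from = comments_from.get(user_id)
        self.df_comments_to = comments_to.get(user_id)
        self.df_edits = edits.get(user_id)
        self.df_user = users.get(user_id)
        self.df_uw = uws.get(user_id)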

In [16]:
attacked_user_objects = [User(user_id,
                              df_comments_from_groups,
                              df_comments_to_groups,
                              df_edits_groups,
                              df_user_groups,
                              df_uw_groups)
                         for user_id in df_attacked_users['user_id']]

with open("../../data/retention/attacked_user_data.pkl", "wb") as f:
    pickle.dump(attacked_user_objects, f)

In [17]:
random_user_objects = [User(user_id,
                            df_comments_from_groups,
                            df_comments_to_groups,
                            df_edits_groups,
                            df_user_groups,
                            df_uw_groups)
                       for user_id in df_random_users['user_id']]

with open("../../data/retention/random_user_data.pkl", "wb") as f:
    pickle.dump(random_user_objects, f)
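
The pickled lists can be reloaded in downstream analyses; a minimal example:

# reload the serialized User objects in a later notebook
with open("../../data/retention/attacked_user_data.pkl", "rb") as f:
    attacked_user_objects = pickle.load(f)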
